"""
The code is released exclusively for review purposes with the following terms:
PROPRIETARY AND CONFIDENTIAL. UNAUTHORIZED USE, COPYING, OR DISTRIBUTION OF THE 
CODE, VIA ANY MEDIUM, IS STRICTLY PROHIBITED. BY ACCESSING THE CODE, THE 
REVIEWERS AGREE TO DELETE THEM FROM ALL MEDIA AFTER THE REVIEW PERIOD IS OVER.
"""
from sklearn.preprocessing import OneHotEncoder 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import sklearn
import pandas as pd
import os
from collections import OrderedDict

# TODO (tabular data processing):
#   Input: the type of scaling to apply to numerical features.
#   Output: feature names and dummy-coded category names.

def get_preprocess_adult():
    """Download the UCI Adult data and return a numeric matrix plus labels.

    Steps: label-encode the class column, integer-encode the categorical
    columns, drop the "fnlwgt" weight column, min-max scale the remaining
    numerical columns to [0, 1] and one-hot encode the categorical columns.

    Returns:
        data: 2-D float array. The one-hot columns of the categorical
            features come first; the scaled numerical columns follow.
        labels: 1-D array of integer class codes, one per row of ``data``.
    """
    # Raw column order: Age, Workclass, fnlwgt, Education, Education-Num,
    # Marital Status, Occupation, Relationship, Race, Sex, Capital Gain,
    # Capital Loss, Hours per week, Country, followed by the class label.
    data = np.genfromtxt(
            'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
            delimiter=', ', dtype=str)

    # Labels encoding (the label is the last raw column)
    labels = sklearn.preprocessing.LabelEncoder().fit_transform(data[:, 14])
    data = data[:, :-1]

    # Categorical recoding: replace every category string by its integer code
    # so that the whole array can be cast to float afterwards.
    for feature in [1, 3, 5, 6, 7, 8, 9, 13]:
        encoder = sklearn.preprocessing.LabelEncoder()
        data[:, feature] = encoder.fit_transform(data[:, feature])

    # Convert to float
    data = data.astype(float)

    # Remove weight variable (column 2); the categorical indices shift down
    # by one for every column located after it.
    data = np.delete(data, 2, axis=1)
    categorical_features = [1, 2, 4, 5, 6, 7, 8, 12]

    # Min max scaling of the numerical columns (the scaler works per column)
    numerical_features = [i for i in range(data.shape[1])
                          if i not in categorical_features]
    scaler = MinMaxScaler(feature_range=(0, 1))
    data[:, numerical_features] = scaler.fit_transform(
        data[:, numerical_features])

    # One hot encoder for categorical columns. scikit-learn 1.2 renamed the
    # `sparse` keyword to `sparse_output` and 1.4 removed the old name, so
    # try the new spelling first and fall back for older releases.
    try:
        one_hot = OneHotEncoder(sparse_output=False)
    except TypeError:
        one_hot = OneHotEncoder(sparse=False)

    # This pushes the non-categorical columns to the end of the output
    columnTransformer = ColumnTransformer([('enc',
                                            one_hot,
                                            categorical_features)],
                                          remainder='passthrough')

    # Transformed data
    data = columnTransformer.fit_transform(data)

    return data, labels


def AdultDataset():
    """Load the UCI Adult data as a typed DataFrame with encoded labels.

    The raw file is downloaded, the "fnlwgt" column is dropped, rows that
    contain a "?" entry are removed, and the columns are reordered so that
    numerical features precede categorical ones. Categorical columns are
    finally recoded to category dtype whose categories are float codes.

    Returns:
        df: feature DataFrame (numerical columns first, then categorical).
        labels: integer-encoded values of the last raw column.
        categorical_name_inds: dict mapping each categorical column name to
            the list of its category values, captured before recoding.
        categorical_cols: categorical column names in DataFrame order.
    """
    raw_columns = ["Age", "Workclass", "fnlwgt", "Education", "Education-Num",
                   "Marital Status", "Occupation", "Relationship", "Race",
                   "Sex", "Capital Gain", "Capital Loss", "Hours per week",
                   "Country"]
    raw = np.genfromtxt(
        'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
        delimiter=', ', dtype=str)

    # Drop the unwanted raw columns and their names
    dropped = [2]
    raw = np.delete(raw, dropped, axis=1)
    feature_names = [name for pos, name in enumerate(raw_columns)
                     if pos not in dropped]

    # Encode the label, which is the last remaining column.
    # NOTE(review): labels are encoded for every downloaded row, whereas rows
    # containing "?" are dropped from the DataFrame further down. If any row
    # is dropped, `labels` and `df` differ in length - confirm how callers
    # align them.
    labels = sklearn.preprocessing.LabelEncoder().fit_transform(raw[:, 13])
    features = raw[:, :-1]

    categorical_feature_inds = [1, 2, 4, 5, 6, 7, 8, 12]
    numerical_feature_inds = np.setdiff1d(range(features.shape[1]),
                                          categorical_feature_inds)
    categorical_names = [feature_names[pos]
                         for pos in categorical_feature_inds]

    # Keep only the rows without any "?" entry
    df = pd.DataFrame(features, columns=feature_names)
    has_missing = (df == "?").any(axis=1)
    df = df.loc[~has_missing, :]

    # Give every column its working dtype
    for colname in feature_names:
        wanted = "category" if colname in categorical_names else "float"
        df[colname] = df[colname].astype(wanted)

    # Convention: numerical features first, categorical features last
    numerical_part = df.iloc[:, numerical_feature_inds]
    categorical_part = df.iloc[:, categorical_feature_inds]
    df = pd.concat([numerical_part, categorical_part], axis=1)
    categorical_name_inds, categorical_cols = obtain_categorical_name_inds(df)

    # For ease of use replace each category by its code, stored as a float
    # valued category.
    for colname in categorical_names:
        df[colname] = (df[colname].cat.codes
                       .astype("float")
                       .astype("category"))

    return df, labels, categorical_name_inds, categorical_cols


def MEPSDataset_util_19():
    """Load MEPS panel 19 through aif360 and build categorical feature frames.

    The aif360 ``MEPSDataset19`` loader is run with a custom preprocessing
    function that keeps the individual utilization count columns. The
    resulting frame is then reduced to its feature columns, recoded, and
    one-hot encoded.

    Returns:
        X_df: one-hot encoded features (``pd.get_dummies`` with "=" as the
            prefix separator).
        X_nodum_df: the same features before one-hot encoding, one
            categorical column per feature.
        y_df: row-wise sum of the OBTOTV, OPTOTV, ERTOT, IPNGTD and HHTOTD
            columns.
        instance weights: the "instance_weights" entry of the attribute dict
            returned by ``convert_to_dataframe``.
        categorical_feature_names: list of the columns of ``X_nodum_df``.
        categorical_feature_name_inds: dict mapping each of those columns to
            the list of its categories.
    """
    import aif360
    from aif360.datasets import MEPSDataset19

    def custom_preprocessing19(df):
        """
        1.Create a new column, RACE that is 'White' if RACEV2X = 1 and HISPANX = 2 i.e. non Hispanic White
          and 'non-White' otherwise
        2. Restrict to Panel 19
        3. RENAME all columns that are PANEL/ROUND SPECIFIC
        4. Drop rows based on certain values of individual features that correspond to missing/unknown - generally < -1
        5. Compute UTILIZATION, binarize it to 0 (< 10) and 1 (>= 10)

        The individual count columns (OBTOTV, OPTOTV, ERTOT, IPNGTD, HHTOTD)
        are renamed but left in the frame, so they remain available after
        UTILIZATION has been computed.
        """
        # Row-wise helper used with DataFrame.apply
        def race(row):
            if ((row['HISPANX'] == 2) and (row['RACEV2X'] == 1)):  #non-Hispanic Whites are marked as WHITE; all others as NON-WHITE
                return 'White'
            return 'Non-White'

        # RACEV2X is overwritten with the label and then renamed to RACE
        df['RACEV2X'] = df.apply(lambda row: race(row), axis=1)
        df = df.rename(columns = {'RACEV2X' : 'RACE'})

        df = df[df['PANEL'] == 19]

        # RENAME COLUMNS
        df = df.rename(columns = {'FTSTU53X' : 'FTSTU', 'ACTDTY53' : 'ACTDTY', 'HONRDC53' : 'HONRDC', 'RTHLTH53' : 'RTHLTH',
                                  'MNHLTH53' : 'MNHLTH', 'CHBRON53' : 'CHBRON', 'JTPAIN53' : 'JTPAIN', 'PREGNT53' : 'PREGNT',
                                  'WLKLIM53' : 'WLKLIM', 'ACTLIM53' : 'ACTLIM', 'SOCLIM53' : 'SOCLIM', 'COGLIM53' : 'COGLIM',
                                  'EMPST53' : 'EMPST', 'REGION53' : 'REGION', 'MARRY53X' : 'MARRY', 'AGE53X' : 'AGE',
                                  'POVCAT15' : 'POVCAT', 'INSCOV15' : 'INSCOV', 'OBTOTV15': 'OBTOTV', 'OPTOTV15': 'OPTOTV',
                                 'ERTOT15': 'ERTOT', 'IPNGTD15': 'IPNGTD', 'HHTOTD15': 'HHTOTD'})

        df = df[df['REGION'] >= 0] # remove values -1
        df = df[df['AGE'] >= 0] # remove values -1

        df = df[df['MARRY'] >= 0] # remove values -1, -7, -8, -9

        df = df[df['ASTHDX'] >= 0] # remove values -1, -7, -8, -9

        df = df[(df[['FTSTU','ACTDTY','HONRDC','RTHLTH','MNHLTH','HIBPDX','CHDDX','ANGIDX','EDUCYR','HIDEG',
                                 'MIDX','OHRTDX','STRKDX','EMPHDX','CHBRON','CHOLDX','CANCERDX','DIABDX',
                                 'JTPAIN','ARTHDX','ARTHTYPE','ASTHDX','ADHDADDX','PREGNT','WLKLIM',
                                 'ACTLIM','SOCLIM','COGLIM','DFHEAR42','DFSEE42','ADSMOK42',
                                 'PHQ242','EMPST','POVCAT','INSCOV']] >= -1).all(1)]  #for all other categorical features, remove values < -1

        # Row-wise helper: total of the five utilization count columns
        def utilization(row):
            return row['OBTOTV'] + row['OPTOTV'] + row['ERTOT'] + row['IPNGTD'] + row['HHTOTD']

        # The total is binarized in place: < 10 becomes 0.0, >= 10 becomes 1.0
        df['TOTEXP15'] = df.apply(lambda row: utilization(row), axis=1)
        lessE = df['TOTEXP15'] < 10.0
        df.loc[lessE,'TOTEXP15'] = 0.0
        moreE = df['TOTEXP15'] >= 10.0
        df.loc[moreE,'TOTEXP15'] = 1.0

        df = df.rename(columns = {'TOTEXP15' : 'UTILIZATION'})
    #     df = df.rename(columns = {'DFHEAR42' : 'DFHEAR'})
    #     df = df.rename(columns = {'DFSEE32' : 'DFSEE'})
    #     df = df.rename(columns = {'ADSMOK42' : 'ADSMOK'})

        return df

    # Columns requested from the loader; includes the five count columns that
    # later form the regression target.
    features_to_keep19=['REGION','AGE','SEX','RACE','MARRY',
            'FTSTU','ACTDTY','HONRDC','RTHLTH','MNHLTH','HIBPDX','CHDDX','ANGIDX',
            'MIDX','OHRTDX','STRKDX','EMPHDX','CHBRON','CHOLDX','CANCERDX','DIABDX',
            'JTPAIN','ARTHDX','ARTHTYPE','ASTHDX','ADHDADDX','PREGNT','WLKLIM',
            'ACTLIM','SOCLIM','COGLIM','DFHEAR42','DFSEE42','ADSMOK42',
            'EMPST','POVCAT','INSCOV','OBTOTV',
            'OPTOTV','ERTOT','IPNGTD','HHTOTD','UTILIZATION','PERWT15F']

    # presumably returns (DataFrame, attribute dict) with categorical columns
    # collapsed back to one column each - verify against the aif360 version in use
    data19_orig, attr19 = MEPSDataset19(features_to_keep=features_to_keep19,
                         custom_preprocessing=custom_preprocessing19
                         ).convert_to_dataframe(de_dummy_code=True)

    data19_orig["RACE"] = data19_orig["RACE"].astype("category")
    # Filter and recode features
    # data19["AGE"]
    # NOTE(review): pd.cut defaults to right-closed bins that exclude the
    # lowest edge, so AGE == 0 would fall outside every bin - confirm whether
    # such rows occur and whether that is intended.
    age_bins = [0, 17, 24, 44, 64, 85]
    data19_orig["AGE_BIN"] = pd.cut(data19_orig["AGE"], age_bins)

    # Drop features
    features_to_drop = ["AGE", "REGION", "MARRY", "FTSTU", "ACTDTY", 
                        "HONRDC", "UTILIZATION", "PHQ242"]
    data19_orig = data19_orig.drop(columns=features_to_drop)

    # rename columns
    data19_orig = data19_orig.rename(columns = {'DFHEAR42': "DFHEAR",
                                  'DFSEE42': "DFSEE",
                                  'ADSMOK42': "ADSMOK"})

    # X and y columns
    y_cols = ['OBTOTV','OPTOTV','ERTOT','IPNGTD','HHTOTD']
    X_cols = [f for f in data19_orig.columns
                 if f not in y_cols]

    X_nodum_df = data19_orig.loc[:, X_cols]

    # Replace the RACE and AGE_BIN categories by their integer codes
    X_nodum_df["RACE"] = X_nodum_df["RACE"].cat.codes
    X_nodum_df["RACE"] = X_nodum_df["RACE"].astype("category")

    X_nodum_df["AGE_BIN"] = X_nodum_df["AGE_BIN"].cat.codes
    X_nodum_df["AGE_BIN"] = X_nodum_df["AGE_BIN"].astype("category")

    # Convert all category values to float64.
    # assumes every remaining X column has category dtype (the .cat accessor
    # is used on each one) - TODO confirm
    for c in X_nodum_df.columns:
        X_nodum_df[c] = X_nodum_df[c].cat.rename_categories(
                            np.array(X_nodum_df[c].cat.categories, 
                            dtype=np.float64))

    # One-hot encoded features and the summed utilization target
    X_df = pd.get_dummies(X_nodum_df, prefix_sep="=")
    y_df = data19_orig.loc[:, y_cols].sum(axis=1)

    categorical_feature_names = list(X_nodum_df.columns)
    categorical_feature_name_inds =  {
                    c: list(X_nodum_df[c].cat.categories)
                    for c in categorical_feature_names}
    
    return (X_df, X_nodum_df, y_df, attr19["instance_weights"], 
            categorical_feature_names,  categorical_feature_name_inds)

def AutoMPGDataset1():
    """Load the Auto MPG data as a typed DataFrame.

    Rows that contain a "?" entry are removed, columns are cast to float or
    category, and the frame is reordered so that numerical features precede
    categorical ones. Categorical columns are finally recoded to category
    dtype whose categories are float codes.

    Returns:
        df: feature DataFrame (numerical columns first, then categorical).
        labels: the labels exactly as returned by ``AutoMPGDataset.data()``.
        categorical_name_inds: dict mapping each categorical column name to
            the list of its category values, captured before recoding.
        categorical_cols: categorical column names in DataFrame order.
    """
    from AutoMPG_dataset import AutoMPGDataset

    # Load autompgs data
    (data, labels, feature_names,
     _target_name, categorical_feature_inds) = AutoMPGDataset().data()

    # Positions of the numerical features
    numerical_feature_inds = np.setdiff1d(range(data.shape[1]),
                                          categorical_feature_inds)
    categorical_names = [name for pos, name in enumerate(feature_names)
                         if pos in categorical_feature_inds]

    # Keep only the rows without any "?" entry.
    # NOTE(review): `labels` is returned unfiltered; if any row is dropped
    # here, `labels` and `df` differ in length - confirm against the loader.
    df = pd.DataFrame(data, columns=feature_names)
    has_missing = (df == "?").any(axis=1)
    df = df.loc[~has_missing, :]

    # Give every column its working dtype
    for pos, colname in enumerate(feature_names):
        wanted = "category" if pos in categorical_feature_inds else "float"
        df[colname] = df[colname].astype(wanted)

    # Convention: numerical features first, categorical features last
    numerical_part = df.iloc[:, numerical_feature_inds]
    categorical_part = df.iloc[:, categorical_feature_inds]
    df = pd.concat([numerical_part, categorical_part], axis=1)
    categorical_name_inds, categorical_cols = obtain_categorical_name_inds(df)

    # For ease of use replace each category by its code, stored as a float
    # valued category.
    for colname in categorical_names:
        df[colname] = (df[colname].cat.codes
                       .astype("float")
                       .astype("category"))

    return df, labels, categorical_name_inds, categorical_cols

def obtain_categorical_name_inds(df):
    """Collect the categorical columns of ``df`` and their category values.

    A column counts as categorical when its dtype prints as "category".

    Returns:
        A pair ``(categories_by_column, categorical_columns)``: a dict
        mapping each categorical column name to the list of its categories,
        and the list of those column names in DataFrame order.
    """
    categories_by_column = {}
    categorical_columns = []
    for name in df.columns:
        column = df[name]
        if str(column.dtype) != "category":
            continue
        categorical_columns.append(name)
        categories_by_column[name] = list(column.cat.categories)
    return categories_by_column, categorical_columns


def meps_data(util_thresh=1.0, train_size=0.8, random_seed_data_split=6750):
    """Build a train/test split of the MEPS panel 19 utilization data.

    Rows whose total utilization is below ``util_thresh`` are discarded and
    the target is the base-10 logarithm of the remaining totals.

    Args:
        util_thresh: minimum total utilization a row needs to be kept. A
            value <= 0 lets zero totals through, whose logarithm is -inf.
        train_size: forwarded to ``train_test_split``.
        random_seed_data_split: random state of the shuffled split.

    Returns:
        A 7-tuple of
        ``(X_train, X_test, y_train, y_test, w_train, w_test)``,
        ``categorical_feature_names``, ``numerical_feature_names``,
        ``categorical_feature_inds``, ``numerical_feature_inds``,
        ``colnames_onehot`` and ``colnames_orig``.
        The X frames come from the non-dummied feature frame, while the two
        ``*_inds`` lists are positions within the one-hot encoded columns
        (``colnames_onehot``).
    """
    # Imported explicitly: a bare `import sklearn` does not guarantee that
    # the `model_selection` submodule has been loaded.
    from sklearn.model_selection import train_test_split

    # Load the dataset
    (X_df0, X_nodum_df0, y_df0, sample_weights0,
     _, categorical_feature_name_inds) = MEPSDataset_util_19()

    # Numerical and categorical feature names
    colnames_orig = list(X_nodum_df0.columns)
    colnames_onehot = list(X_df0.columns)
    categorical_feature_names = list(categorical_feature_name_inds.keys())
    numerical_feature_names = [f for f in colnames_orig
                               if f not in categorical_feature_names]

    # Positions within the one-hot frame; a set keeps the lookups cheap
    numerical_lookup = set(numerical_feature_names)
    numerical_feature_inds = [idx for (idx, c) in enumerate(colnames_onehot)
                              if c in numerical_lookup]
    categorical_feature_inds = [idx for (idx, c) in enumerate(colnames_onehot)
                                if c not in numerical_lookup]

    # Keep only the rows that reach the utilization threshold; the mask is
    # computed once and shared by features, target and weights.
    keep = y_df0 >= util_thresh
    X = X_nodum_df0.loc[keep, :]
    y = np.log10(y_df0.loc[keep].values)
    sample_weights = sample_weights0[keep]

    # Train/test split
    (X_train, X_test, y_train, y_test,
     w_train, w_test) = train_test_split(
         X, y, sample_weights,
         train_size=train_size,
         shuffle=True,
         random_state=random_seed_data_split)

    return ((X_train, X_test, y_train, y_test, w_train, w_test),
            categorical_feature_names, numerical_feature_names,
            categorical_feature_inds, numerical_feature_inds,
            colnames_onehot, colnames_orig)

def iris_data(train_size=0.8, random_seed_data_split=5000):
    """Build a train/test split of the iris data with unit sample weights.

    Args:
        train_size: forwarded to ``train_test_split``.
        random_seed_data_split: random state of the shuffled split.

    Returns:
        A 7-tuple of
        ``(X_train, X_test, y_train, y_test, w_train, w_test)``,
        ``categorical_feature_names`` (empty), ``numerical_feature_names``,
        ``categorical_feature_inds`` (empty), ``numerical_feature_inds``,
        ``colnames_onehot`` and ``colnames_orig``. Every feature is treated
        as numerical, so both column-name lists equal the feature names.
    """
    # Imported explicitly: a bare `import sklearn` does not guarantee that
    # the `datasets` and `model_selection` submodules have been loaded.
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    iris = load_iris()
    X = pd.DataFrame(iris.data, columns=iris.feature_names)
    y = iris.target
    sample_weights = np.ones(len(y))

    numerical_feature_names = list(X.columns)
    categorical_feature_names = []
    # Derived from the frame rather than hard-coded
    numerical_feature_inds = list(range(X.shape[1]))
    categorical_feature_inds = []

    colnames_onehot = list(X.columns)
    colnames_orig = list(X.columns)

    # Train/test split
    (X_train, X_test, y_train, y_test,
     w_train, w_test) = train_test_split(
         X, y, sample_weights,
         train_size=train_size,
         shuffle=True,
         random_state=random_seed_data_split)

    return ((X_train, X_test, y_train, y_test, w_train, w_test),
                categorical_feature_names, numerical_feature_names,
                categorical_feature_inds, numerical_feature_inds,
                colnames_onehot, colnames_orig)